
import collections
import os
import pickle
import warnings

from paddle.fluid.dygraph import layers
from paddle.framework import get_default_dtype, set_default_dtype
from paddle.fluid.initializer import Constant
from paddle.fluid.param_attr import ParamAttr
from paddle.nn.functional import batch_norm, instance_norm

class _BatchNormBase(layers.Layer):
    """
    BatchNorm base .
    """

    def __init__(self,
                 num_features,
                 momentum=0.9,
                 epsilon=1e-05,
                 weight_attr=None,
                 bias_attr=None,
                 data_format='NCHW',
                 use_global_stats=None,
                 name=None):
        super(_BatchNormBase, self).__init__()
        self._num_features = num_features
        self._weight_attr = weight_attr
        self._bias_attr = bias_attr
        self._use_global_stats = use_global_stats

        # create the norm parameters in float32 even when the default dtype is float16
        if get_default_dtype() == 'float16':
            set_default_dtype('float32')

        param_shape = [num_features]

        # create parameter
        if weight_attr is False:
            # weight is fixed to ones and excluded from gradient computation
            self.weight = self.create_parameter(
                attr=None, shape=param_shape, default_initializer=Constant(1.0))
            self.weight.stop_gradient = True
        else:
            self.weight = self.create_parameter(
                attr=self._weight_attr,
                shape=param_shape,
                default_initializer=Constant(1.0))
            self.weight.stop_gradient = (
                self._weight_attr is not None and
                self._weight_attr.learning_rate == 0.)

        if bias_attr is False:
            # bias is fixed to zeros and excluded from gradient computation
            self.bias = self.create_parameter(
                attr=None,
                shape=param_shape,
                default_initializer=Constant(0.0),
                is_bias=True)
            self.bias.stop_gradient = True
        else:
            self.bias = self.create_parameter(
                attr=self._bias_attr, shape=param_shape, is_bias=True)
            self.bias.stop_gradient = (
                self._bias_attr is not None and
                self._bias_attr.learning_rate == 0.)

        moving_mean_name = None
        moving_variance_name = None

        if name is not None:
            moving_mean_name = name + "_mean"
            moving_variance_name = name + "_variance"

        self.running_mean = self.create_parameter(
            attr=ParamAttr(
                name=moving_mean_name,
                initializer=Constant(0.0),
                trainable=False,
                do_model_average=True),
            shape=param_shape)
        self.running_mean.stop_gradient = True

        self.running_var = self.create_parameter(
            attr=ParamAttr(
                name=moving_variance_name,
                initializer=Constant(1.0),
                trainable=False,
                do_model_average=True),
            shape=param_shape)
        self.running_var.stop_gradient = True

        self._data_format = data_format
        self._in_place = False
        self._momentum = momentum
        self._epsilon = epsilon
        self._fuse_with_relu = False
        self._name = name

    def _check_input_dim(self, input):
        raise NotImplementedError(
            "_check_input_dim must be implemented by subclasses")

    def _check_data_format(self, input):
        raise NotImplementedError(
            "_check_data_format must be implemented by subclasses")

    def forward(self, input):
        self._check_data_format(self._data_format)
        self._check_input_dim(input)

        if self.training:
            warnings.warn(
                "When training, we now always track global mean and variance.")

        return batch_norm(
            input,
            self.running_mean,
            self.running_var,
            weight=self.weight,
            bias=self.bias,
            training=self.training,
            momentum=self._momentum,
            epsilon=self._epsilon,
            data_format=self._data_format,
            use_global_stats=self._use_global_stats)


class BatchNorm2D(_BatchNormBase):
    r"""
    Applies Batch Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift.

    When use_global_stats = False, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch.
    Calculated as follows:

    ..  math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\
        \ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \
        \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\

    When use_global_stats = True, the :math:`\mu_{\beta}`
    and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch.
    They are global or running statistics (moving_mean and moving_variance),
    usually obtained from a pre-trained model. Calculated as follows:

    .. math::
        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global\ mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global\ variance \\

    The normalization function is as follows:

    ..  math::

        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
        \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

    - :math:`\epsilon` : a small value added to the variance to prevent division by zero
    - :math:`\gamma` : trainable scale parameter
    - :math:`\beta` : trainable shift parameter

    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
        epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable.
            If the Initializer of the weight_attr is not set, the parameter is initialized to one. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm.
            If it is set to None or one attribute of ParamAttr, batch_norm
            will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable.
            If the Initializer of the bias_attr is not set, the bias is initialized to zero. Default: None.
        data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW.
        use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch; if set to True, use the global statistics; if set to None, use global statistics in the test phase and the statistics of one mini-batch in the training phase. Default: None.
        name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.

    Shape:
        - x: 4-D tensor with shape: (batch, num_features, height, width) when data_format is "NCHW",
            or (batch, height, width, num_features) when data_format is "NHWC".
        - output: 4-D tensor with same shape as input x.

    Returns:
        None

    Examples:
        .. code-block:: python

          import paddle
          import numpy as np

          np.random.seed(123)
          x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32')
          x = paddle.to_tensor(x_data) 
          batch_norm = paddle.nn.BatchNorm2D(1)
          batch_norm_out = batch_norm(x)

          print(batch_norm_out)
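
          # A hedged sketch (not in the original example): the same layer
          # configured to always normalize with the running statistics, per
          # the use_global_stats parameter described above.
          batch_norm_global = paddle.nn.BatchNorm2D(1, use_global_stats=True)
          print(batch_norm_global(x))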
    """

    def _check_data_format(self, input):
        if input in ('NCHW', 'NHWC'):
            self._data_format = input
        else:
            raise ValueError('expected NCHW or NHWC for data_format input')

    def _check_input_dim(self, input):
        if len(input.shape) != 4:
            raise ValueError('expected 4D input (got {}D input)'.format(
                len(input.shape)))
    
    def state_dict(self,
                   destination=None,
                   include_sublayers=True,
                   structured_name_prefix=""):
        # Load any previously recorded name mapping, extend it with this
        # layer's parameters (translating running_mean/running_var to the
        # _mean/_variance naming), and persist it back to param_names.pkl.
        param_path = 'param_names.pkl'
        param_names = {}
        if os.path.exists(param_path):
            with open(param_path, 'rb') as f:
                param_names = pickle.load(f)

        if destination is None:
            destination = collections.OrderedDict()
        for name, data in self._parameters.items():
            new_name = name
            if name.endswith('running_mean'):
                new_name = name.replace('running_mean', '_mean')
            elif name.endswith('running_var'):
                new_name = name.replace('running_var', '_variance')
            param_names[structured_name_prefix + name] = \
                structured_name_prefix + new_name
            if data is not None:
                destination[structured_name_prefix + name] = data
        for name, buffer in self._buffers.items():
            if buffer is not None and name not in self._non_persistable_buffer_names_set:
                destination[structured_name_prefix + name] = buffer

        with open(param_path, 'wb') as f:
            pickle.dump(param_names, f)
        if include_sublayers:
            for layer_name, layer_item in self._sub_layers.items():
                if layer_item is not None:
                    destination_temp = destination.copy()
                    destination_temp.update(
                        layer_item.state_dict(
                            destination_temp, include_sublayers,
                            structured_name_prefix + layer_name + "."))
                    destination = destination_temp
        return destination


class _InstanceNormBase(layers.Layer):
    """
    This class is based class for InstanceNorm1D, 2d, 3d. 

    See InstaceNorm1D, InstanceNorm2D or InstanceNorm3D for more details.
    """

    def __init__(self,
                 num_features,
                 epsilon=1e-5,
                 momentum=0.9,
                 weight_attr=None,
                 bias_attr=None,
                 data_format="NCHW",
                 name=None):
        super(_InstanceNormBase, self).__init__()

        if weight_attr is False or bias_attr is False:
            assert weight_attr == bias_attr, "weight_attr and bias_attr must be set to False at the same time in InstanceNorm"
        self._epsilon = epsilon
        self._weight_attr = weight_attr
        self._bias_attr = bias_attr

        if weight_attr is not False and bias_attr is not False:
            self.weight = self.create_parameter(
                attr=self._weight_attr,
                shape=[num_features],
                default_initializer=Constant(1.0),
                is_bias=False)
            self.bias = self.create_parameter(
                attr=self._bias_attr,
                shape=[num_features],
                default_initializer=Constant(0.0),
                is_bias=True)
        else:
            self.weight = None
            self.bias = None

    def _check_input_dim(self, input):
        raise NotImplementedError(
            "_check_input_dim must be implemented by subclasses")

    def forward(self, input):
        self._check_input_dim(input)

        return instance_norm(
            input, weight=self.weight, bias=self.bias, eps=self._epsilon)


class InstanceNorm2D(_InstanceNormBase):
    r"""
    Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization.

    DataLayout: NCHW `[batch, in_channels, in_height, in_width]`


    :math:`input` is the input features over a mini-batch.

    ..  math::

        \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\
        \ mean\ of\ one\ feature\ map\ in\ mini-batch \\
        \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \
        \mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\
        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\
        \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

    Note:
        `H` means height of feature map, `W` means width of feature map.

    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
        epsilon(float, optional): A value added to the denominator for
            numerical stability. Default is 1e-5.
        momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale`
            of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm
            will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr.
            If the Initializer of the weight_attr is not set, the parameter is initialized
            to one. If it is set to False, no weight parameter is created. Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of instance_norm.
            If it is set to None or one attribute of ParamAttr, instance_norm
            will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
            If the Initializer of the bias_attr is not set, the bias is initialized to zero.
            If it is set to False, no bias parameter is created. Default: None.
        data_format(str, optional): Specify the input data format, could be "NCHW". Default: NCHW.
        name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.

    Shape:
        - x: 4-D tensor with shape: (batch, num_features, height, width).
        - output: 4-D tensor with same shape as input x.

    Returns:
        None.


    Examples:

        .. code-block:: python

          import paddle
          import numpy as np

          np.random.seed(123)
          x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32')
          x = paddle.to_tensor(x_data) 
          instance_norm = paddle.nn.InstanceNorm2D(2)
          instance_norm_out = instance_norm(x)

          print(instance_norm_out)
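
          # A hedged sketch (not in the original example): disabling the
          # affine transform; _InstanceNormBase requires weight_attr and
          # bias_attr to be set to False together.
          instance_norm_plain = paddle.nn.InstanceNorm2D(
              2, weight_attr=False, bias_attr=False)
          print(instance_norm_plain(x))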
    """

    def _check_input_dim(self, input):
        if len(input.shape) != 4:
            raise ValueError('expected 4D input (got {}D input)'.format(
                len(input.shape)))
    
    def state_dict(self,
                   destination=None,
                   include_sublayers=True,
                   structured_name_prefix=""):
        # Load any previously recorded name mapping, extend it with this
        # layer's parameters (translating weight to the scale naming), and
        # persist it back to param_names.pkl.
        param_path = 'param_names.pkl'
        param_names = {}
        if os.path.exists(param_path):
            with open(param_path, 'rb') as f:
                param_names = pickle.load(f)

        if destination is None:
            destination = collections.OrderedDict()
        for name, data in self._parameters.items():
            new_name = name
            if name.endswith('weight'):
                new_name = name.replace('weight', 'scale')
            param_names[structured_name_prefix + name] = \
                structured_name_prefix + new_name
            if data is not None:
                destination[structured_name_prefix + name] = data
        for name, buffer in self._buffers.items():
            if buffer is not None and name not in self._non_persistable_buffer_names_set:
                destination[structured_name_prefix + name] = buffer

        with open(param_path, 'wb') as f:
            pickle.dump(param_names, f)
        if include_sublayers:
            for layer_name, layer_item in self._sub_layers.items():
                if layer_item is not None:
                    destination_temp = destination.copy()
                    destination_temp.update(
                        layer_item.state_dict(
                            destination_temp, include_sublayers,
                            structured_name_prefix + layer_name + "."))
                    destination = destination_temp
        return destination
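

# A minimal usage sketch (not part of the original module): it assumes paddle
# is installed and exercises both layers plus the custom state_dict name
# mapping that is persisted to param_names.pkl above.
if __name__ == '__main__':
    import paddle

    x = paddle.rand([2, 1, 4, 4], dtype='float32')

    bn = BatchNorm2D(1)
    print(bn(x).shape)  # same shape as the input: [2, 1, 4, 4]
    print(list(bn.state_dict().keys()))

    inorm = InstanceNorm2D(1)
    print(inorm(x).shape)  # same shape as the input: [2, 1, 4, 4]
    print(list(inorm.state_dict().keys()))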
